## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
pacman::p_load(
  # Data Manipulation & Cleaning
  tidyverse, janitor, fs, glue,

  # Data Visualization & Plotting
  ggthemes, ggridges, ggrepel, viridis, patchwork, scales,

  # Statistical Modeling & Analysis
  brms, BradleyTerry2, icr,

  # Utilities & Performance
  remotes, tictoc
)

## Colors for visualization
ger_color  <- c(
  "AfD" = "#00ADEF",
  "CDU/CSU" = "grey60",
  "FDP" = "#ebcc34",# "#ffed00",
  "Greens" = "#46962b",
  "SPD" = "#E2001A",
  "Left" = "#8B1A1A"
)

## ----Loading artefacts for appendix------------------------------------------------------------------------------------------------------------------------------------------------------
# Timestamps for each individual comparison
comp_timestamps <- read_rds("input/comp_with_timestamps.rds") %>%
  glimpse

# MP's info
mp_dt <- read_rds("input/mp_dt.rds") %>%
  glimpse()

# Raw parameter from Bayesian Davidson model
dav_bayesian_preds <- read_rds("data/0_main_params.rds") %>%
  mutate(param_type = str_extract(Parameter, ".*?(?=\\[|$)"),
         pageid = as.numeric(str_extract(Parameter, "(?<=\\[A)\\d+"))) %>%
  filter(param_type == "lambda") %>%
  transmute(pageid, mean = -Mean, low = -HPD_lower, high = -HPD_higher, Rhat) %>%
  left_join(mp_dt, by = "pageid") %>%
  mutate(party = fct_reorder(party, mean)) %>%
  glimpse

# Raw parameter from Bayesian BT model
bt_bayesian_preds <- read_rds("data/1_param_bt.rds") %>%
  mutate(param_type = str_extract(Parameter, ".*?(?=\\[|$)"),
         pageid = as.numeric(str_extract(Parameter, "(?<=\\[A)\\d+"))) %>%
  filter(param_type == "lambda") %>%
  transmute(pageid, mean_bt = -Mean, low_bt = -HPD_lower, high_bt = -HPD_higher) %>%
  glimpse

# Raw parameter from Frequentist BT model
bt_freq_preds <- read_rds("data/2_frequentist_preds.rds") %>%
  select(pageid, mean_bt_f = mean_bt, low_bt_f = low_bt, high_bt_f = high_bt) %>%
  glimpse

merged_estimates <- dav_bayesian_preds %>%
  left_join(bt_bayesian_preds, by = join_by(pageid)) %>%
  left_join(bt_freq_preds, by = join_by(pageid)) %>%
  glimpse

# Load the simulation results
sim_results <- read_rds("input/0_simulations_results.rds") %>%
  mutate(
    n_coders = fct_reorder(factor(n_coders), n_coders),
    n_pairs = fct_reorder(factor(glue("{n_pairs} pairs")), n_pairs)
  ) %>%
  glimpse

bias_avg_dt <- sim_results %>%
  group_by(random, n_coders, n_pairs, sort_units, prob_error) %>%
  summarise(
    across(c(cor, prop_found), list(mean = mean, sd = sd), na.rm = TRUE),
    n = n(),
    .groups = "drop"
  ) %>%
  mutate(random = ifelse(random, "rand", "algo")) %>%
  pivot_wider(
    id_cols = c(n_coders, n_pairs, sort_units, prob_error),
    names_from = random,
    values_from = c(cor_mean, prop_found_mean, cor_sd, prop_found_sd, n)
  ) %>%
  mutate(
    cor_diff = cor_mean_rand - cor_mean_algo,
    prop_found_diff = prop_found_mean_rand - prop_found_mean_algo,
    cor_se = sqrt(cor_sd_rand^2 / n_rand + cor_sd_algo^2 / n_algo),
    prop_found_se = sqrt(prop_found_sd_rand^2 / n_rand + prop_found_sd_algo^2 / n_algo),
    cor_low = cor_diff - 1.96 * cor_se,
    cor_high = cor_diff + 1.96 * cor_se,
    prop_found_low = prop_found_diff - 1.96 * prop_found_se,
    prop_found_high = prop_found_diff + 1.96 * prop_found_se
  ) %>%
  glimpse



## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
median_time_per_comp <- comp_timestamps %>%
  summarise(median_time_diff = median(time_diff, na.rm = TRUE)) %>%
  round(1)

total_time <- comp_timestamps %>%
  # Removing any interval larger than 10 minutes
  filter(time_diff < 10*60) %>%
  summarise(total_time = sum(time_diff, na.rm = TRUE) / 60, .by = user) %>%
  summarise(total_mean_time = mean(total_time)) %>%
  round(1)

print(glue("The median time required to complete a comparison was {median_time_per_comp} seconds. Completing the survey required around {total_time} minutes."))


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
comp_timestamps %>%
  mutate(
    time = (time - min(time))/60/60,
    longer = time > 24,
    time = ifelse(longer, 24, time),
    .by = user
  ) %>%
  ggplot(aes(x =  user, y = time, colour = longer, shape = longer)) +
  geom_point(show.legend = F) +
  coord_flip() +
  theme_bw() +
  scale_colour_manual(values = c("black", "red")) +
  labs(y = "Time elapsed since the first comparison (in hours)", x = "Respondent")

ggsave(filename = "plot/A2_comparaison_over_time.png", width = 8, height = 5)

## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
sim_results %>%
  mutate(sort_units = ifelse(sort_units, "Sorted", "Non-sorted"),
         random = ifelse(random, "Random", "Custom algorithm")) %>%
  ggplot(aes(x = cor, prop_found, color = n_pairs)) +
  geom_point(size = .5, alpha = .4) +
  facet_grid(n_coders~random) +
  scale_color_viridis(discrete = T, direction = -1, begin = 0, end = .9) +
  theme_minimal() +
  labs(x = "Correlation with Ground Truth", y = "% of recovered MPs", color = "") +
  scale_y_continuous(labels = percent) +
  theme(plot.background = element_rect(fill = "white", colour = NA),
        panel.background = element_rect(fill = "white", colour = NA))

ggsave(filename = "plot/B3_sim_raw.png", width = 8, height = 5)


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

bias_avg_dt %>%
  ggplot(aes(x = cor_diff, y = prop_found_diff, color = n_coders)) +
  geom_hline(yintercept = 0, linetype = 4, color = "grey70") +
  geom_vline(xintercept = 0, linetype = 4, color = "grey70") +
  geom_pointrange(aes(xmin = cor_low, xmax = cor_high), size = .03) +
  geom_errorbar(aes(ymin = prop_found_low, ymax = prop_found_high)) +
  facet_wrap(~n_pairs) +
  labs(
    x = "\nCorrelation with GT (Random - Algorithm)\n[Negative means algorithm is better correlated with ground truth.]",
    y = "% recovered units (Random - Algorithm)\n[Negative means algorithm recovers more units.]\n",
    color = "Number of simulated coders"
  ) +
  scale_color_viridis(discrete = T, direction = -1) +
  theme_minimal() +
  theme(plot.background = element_rect(fill = "white", colour = NA),
        panel.background = element_rect(fill = "white", colour = NA))

ggsave(filename = "plot/B4_sim_agg.png", width = 8, height = 5)


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
merged_estimates %>%
  ggplot(aes(x = Rhat)) +
  geom_histogram(bins = 43 ) +
  theme_bw() +
  labs(x = "\nGelman-Rubin R-hat statistics", y = "Number of parameter\n")

ggsave(filename = "plot/C5_rhat.png", width = 8, height = 5)


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

cor_dav_bt <- round(cor(merged_estimates$mean, merged_estimates$mean_bt,use = "complete.obs"), 2)
print(glue("Correlation between Davidson and Bradley-Terry model: {cor_dav_bt}"))


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
merged_estimates %>%
  ggplot(aes(x = mean,  y = mean_bt, color = party)) +
  geom_point() +
  facet_wrap(~party, scales = "free") +
  theme_minimal() +
  scale_color_manual(values = ger_color) +
  guides(color = "none") +
  labs(x = "\nDavidson model\n(including 'similar positions')", y = "\nBradley-Terry model\n(excluding 'similar positions')") +
  theme(plot.background = element_rect(fill = "white", colour = NA),
        panel.background = element_rect(fill = "white", colour = NA))

ggsave(filename = "plot/D6_comp_davidson_bt.png", width = 8, height = 5)


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
merged_estimates %>%
  mutate(unc = low - high,
         unc_bt =  low_bt - high_bt) %>%
  ggplot(aes(x = unc,  y = unc_bt, color = party)) +
  geom_point() +
  facet_wrap(~party) +
  theme_minimal() +
  scale_color_manual(values = ger_color) +
  coord_equal(xlim = c(0, 10), ylim = c(0, 10)) +
  guides(color = "none") +
  geom_abline(slope = 1, linetype = 4, color = "grey60") +
  labs(x = "\nDavidson model\n(including 'similar positions')", y = "\nBradley-Terry model\n(excluding 'similar positions')") +
  theme(plot.background = element_rect(fill = "white", colour = NA),
        panel.background = element_rect(fill = "white", colour = NA))

ggsave(filename = "plot/D7_width_credible_intervals.png", width = 8, height = 5)


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
p1 <- merged_estimates %>%
  select(pageid, mean, low, high) %>%
  ggplot(aes(x = mean)) +
  geom_histogram() +
  theme_minimal() +
  labs(y = "Davidson model\n(Bayesian)", x = "Estimates") +
  theme(plot.background = element_rect(fill = "white", colour = NA),
        panel.background = element_rect(fill = "white", colour = NA))

p2 <- merged_estimates %>%
  select(pageid, mean_bt_f) %>%
  ggplot(aes(x = mean_bt_f)) +
  geom_histogram() +
  theme_minimal() +
  labs(y = "Bradley-Terry model\n(Frequentist)", x = "Estimates") +
  theme(plot.background = element_rect(fill = "white", colour = NA),
        panel.background = element_rect(fill = "white", colour = NA))

p1/p2

ggsave(filename = "plot/D8_benchmark_freq_hist.png", width = 8, height = 5)


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

merged_estimates %>%
  ggplot(aes(x = mean,  y = mean_bt_f, color = party)) +
  geom_point() +
  facet_wrap(~party, scales = "free") +
  theme_minimal() +
  scale_color_manual(values = ger_color) +
  guides(color = "none") +
  labs(x = "\nDavidson model\n(including 'similar positions')", y = "\nBradley-Terry model\n(excluding 'similar positions')") +
  theme(plot.background = element_rect(fill = "white", colour = NA),
        panel.background = element_rect(fill = "white", colour = NA))

ggsave(filename = "plot/D9_benchmark_freq_est.png", width = 8, height = 5)


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
merged_estimates %>%
  drop_na(gender) %>%
  transmute(
    mean,
    party,
    gender = fct_reorder(as.character(gender), mean),
    party_sex = paste(party, gender) %>%
      fct_reorder(as.numeric(as.factor(party)))
  ) %>%
  ggplot(aes(x = party_sex, y = mean, group = party_sex, fill = party)) +
  geom_boxplot(aes(alpha = gender), show.legend = F, outlier.alpha = 0) +
  scale_fill_manual(values = ger_color) +
  theme_bw() +
  coord_flip() +
  guides(fill = F) +
  labs(y = "Ideological Position", x = "")

ggsave(filename = "plot/E11_gender.png", width = 8, height = 5)


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
merged_estimates %>%
  filter(!party %in% c("FDP")) %>%
  drop_na(constituency) %>%
  transmute(
    mean,
    party,
    direct = ifelse(constituency != "Landesliste", "Constituency", "Party list") %>%
      fct_reorder(mean),
    party_direct = paste(party, direct) %>%
      fct_reorder(as.numeric(as.factor(party)))
  ) %>%
  ggplot(aes(x = party_direct, y = mean, alpha = direct, group = party_direct, fill = party)) +
  geom_boxplot(aes(alpha = direct), show.legend = F, outlier.alpha = 0) +
  scale_fill_manual(values = ger_color) +
  scale_color_viridis(discrete = T) +
  theme_bw() +
  coord_flip() +
  guides(fill = "none") +
  labs(y = "Ideological Position", x = "", fill = "")

ggsave(filename = "plot/E12_constituency_vs_list_mps.png", width = 8, height = 5)


## ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
ost <- c("Mecklenburg-Vorpommern", "Brandenburg", "Sachsen", "Thüringen", "Sachsen-Anhalt", "Berlin")

merged_estimates %>%
  mutate(east = ifelse(region %in% ost, "- East", "- West")) %>%
  mutate(faction = paste(party, east, glue("({n()})")), .by = c(party, east)) %>%
  filter(!is.na(faction)) %>%
  mutate(faction = fct_reorder(faction, as.numeric(party))) %>%
  ggplot(aes(x = mean, y = faction, fill = party)) +
  geom_boxplot(show.legend = F, outlier.alpha = 0) +
  scale_fill_manual(values = ger_color) +
  theme_bw() +
  labs(y = "", x = "Ideological Position")

ggsave(filename = "plot/E13_east_divide.png", width = 8, height = 5)

